Exploratory Data Analysis

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pandas_profiling
import random

import plotly.io as pio
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objs as go
from plotly import tools, subplots
import plotly
# Initialise Plotly's offline notebook mode so figures can be rendered inline.
plotly.offline.init_notebook_mode()
# Make the dark template the default for every figure created below.
pio.templates.default = "plotly_dark"
In [2]:
# load data
# Read the semicolon-separated records, then normalise the two columns the
# rest of the notebook relies on: 'DOR' is parsed to datetimes and 'Age' is
# rounded, stored as a nullable integer and has its missing values set to 0.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory.
data = pd.read_csv(
    '/home/prodigalson/J Files/Project - Time Series/data.csv',
    sep=";",
    index_col=None,
)
data = data.assign(
    DOR=pd.to_datetime(data['DOR']),
    Age=data['Age'].round().astype('Int64').fillna(0),
)

Sample data

In [3]:
# Render the first rows of the dataset as a Plotly table.
ff.create_table(data.head())

pandas profile report

In [4]:
# Build the pandas_profiling report for the whole dataset.
data.profile_report()
Out[4]:

In [5]:
# month wise cases
# Tag every record with the 'YYYY-MM' month of its 'DOR' date, then count the
# records per month for males, for females and for all records together.
data['MnthWse'] = data['DOR'].map(lambda stamp: stamp.strftime('%Y-%m'))

male_rows = data['Sex'] == 'M'
female_rows = data['Sex'] == 'F'

mCnt = data.loc[male_rows, 'MnthWse'].value_counts().sort_index()
fCnt = data.loc[female_rows, 'MnthWse'].value_counts().sort_index()
totl = data['MnthWse'].value_counts().sort_index()
In [6]:
# year wise cases
# Tag every record with the four-digit year of its 'DOR' date, then count the
# records per year for males, for females and for all records together.
data['YrWse'] = data['DOR'].map(lambda stamp: stamp.strftime('%Y'))

sex = data['Sex']
ymCnt = data.loc[sex.eq('M'), 'YrWse'].value_counts().sort_index()
yfCnt = data.loc[sex.eq('F'), 'YrWse'].value_counts().sort_index()
ytotl = data['YrWse'].value_counts().sort_index()
In [7]:
# Yearly case counts for male patients, shown as the cell output.
ymCnt
Out[7]:
2013    2280
2014     739
2015     531
2016    1093
2017    4237
2018     189
Name: YrWse, dtype: int64

Reported cases over time

In [8]:
# Line chart of the month-wise counts: one trace each for male, female and
# total cases, with a range slider under the x axis.
fig = go.Figure()
for trace_name, monthly_counts in (('Male', mCnt), ('Female', fCnt), ('Total', totl)):
    fig.add_trace(
        go.Scatter(
            x=monthly_counts.index,
            y=monthly_counts,
            mode='lines',
            name=trace_name,
        )
    )

fig.update_layout(
    title=dict(
        text="Trivandrum - Reported Cases Over Time",
        y=0.95,
        x=0.05,
        xanchor='left',
        yanchor='top',
    ),
    xaxis_title="Year",
    yaxis_title="Number of Patients",
    xaxis_tickformat='%Y',
    font={
        'family': "Courier New, monospace",
        'size': 15,
        'color': "#7f7f7f",
    },
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()
In [9]:
# data summary
# One row per 'YYYY-MM' month with its case count, split into separate
# month ('01'..'12') and year columns.
month_counts = data['MnthWse'].value_counts().sort_index()
sumry = pd.DataFrame(month_counts).reset_index()
sumry.columns = ['MnthWse', 'Cnt']
sumry['MnthWse'] = pd.to_datetime(sumry['MnthWse'], format='%Y-%m')
sumry['Mnth'] = sumry['MnthWse'].dt.strftime('%m')
sumry['Year'] = sumry['MnthWse'].dt.strftime('%Y')

# Month x Year pivot. Each (month, year) pair has a single row in `sumry`,
# so pivot_table's default aggregation returns that row's count.
dataSumry = sumry.pivot_table(values="Cnt", index="Mnth", columns="Year")
In [10]:
# Year labels that form the columns of the pivot.
dataSumry.columns
Out[10]:
Index(['2013', '2014', '2015', '2016', '2017', '2018'], dtype='object', name='Year')

Pivot table

In [11]:
# Show the pivot as a Plotly table; reset_index() turns the month index into
# a regular column so it appears in the table.
ff.create_table(dataSumry.reset_index())

Reported cases in each year

In [12]:
# Stacked bar chart of yearly cases split by sex, with the overall yearly
# total written above each stack.
#
# The year axis is taken from `ytotl` itself and both per-sex series are
# re-aligned to it. Pairing the counts positionally with an unrelated index
# would shift every later bar onto the wrong year as soon as one sex has no
# records in some year.
bar_years = ytotl.index

fig = go.Figure(data=[
    go.Bar(name='Female',
           x=bar_years,
           y=yfCnt.reindex(bar_years, fill_value=0),
           marker_color='DarkOrange'),
    go.Bar(name='Male',
           x=bar_years,
           y=ymCnt.reindex(bar_years, fill_value=0),
           marker_color='dodgerblue')
])

# One annotation per year, placed 5% above the total so it clears the stack.
total_labels = [
    {"x": year, "y": total * 1.05, "text": str(total), "showarrow": False}
    for year, total in ytotl.items()
]

fig.update_layout(barmode='stack', annotations=total_labels,
                  title={
                      'text': "Cases in each Year (Gender Wise)",
                      'y': 0.95,
                      'x': 0.03,
                      'xanchor': 'left',
                      'yanchor': 'top'},
                  font=dict(
                      family="Courier New, monospace",
                      size=15,
                      color="#7f7f7f"
                  ))
fig.show()

Reported cases by month, compared across years

In [13]:
# One line per year (a column of the pivot), plotted against the month index
# so the years can be compared month by month.
fig = go.Figure(data=[
    go.Scatter(
        x=dataSumry.index,
        y=dataSumry[year_label],
        mode='lines',
        name=year_label,
    )
    for year_label in dataSumry.columns
])

fig.update_layout(
    title=dict(
        text="Trivandrum - Reported Cases Over Time (Year wise)",
        y=0.9,
        x=0.05,
        xanchor='left',
        yanchor='top',
    ),
    yaxis_title="Number of Patients",
    # Replace the numeric month ticks with month abbreviations.
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    },
    font={
        'family': "Courier New, monospace",
        'size': 15,
        'color': "#7f7f7f",
    },
)
fig.show()

Box plot of cases

In [14]:
# Box plot of the yearly case counts for each calendar month.
N = 12
mnth = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# endpoint=False: hue 0 and hue 360 are the same colour, so including the
# endpoint would give the first and last month identical boxes.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N, endpoint=False)]

# Select the pivot rows by their zero-padded month label ('01'..'12') rather
# than by position, so a month that is absent from the data becomes an empty
# box instead of shifting every later month under the wrong name.
month_rows = dataSumry.reindex(['{:02d}'.format(m) for m in range(1, N + 1)])

fig = go.Figure(data=[
    go.Box(
        y=month_rows.iloc[pos],
        marker_color=c[pos],
        name=month_name,
    )
    for pos, month_name in enumerate(mnth)
])

fig.update_layout(
    title={'text': "Box plot of cases (Month Wise)",
           'y': 0.95,
           'x': 0.03,
           'xanchor': 'left',
           'yanchor': 'top'},
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    ),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=True),
    yaxis=dict(zeroline=False, gridcolor='blue'),
)

fig.show()

Distribution of Gender

In [15]:
# Donut chart of the share of cases per sex.
#
# value_counts() orders its result by frequency, so the slice labels are
# derived from its index instead of being hard-coded; a fixed
# ["Male", "Female"] list would be swapped whenever females outnumber males.
sex_counts = data['Sex'].value_counts()
sex_names = {'M': 'Male', 'F': 'Female'}

# Codes other than 'M'/'F' are shown verbatim rather than mislabelled.
labels = [sex_names.get(code, str(code)) for code in sex_counts.index]
values = sex_counts

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.6)])
fig.update_layout(
    title={'text': "Distribution of Gender",
           'y': 0.9,
           'x': 0.05,
           'xanchor': 'left',
           'yanchor': 'top'},
    font=dict(family="Courier New, monospace",
              size=15,
              color="#7f7f7f")
    )
fig.show()

Distribution of age

In [16]:
# Histogram with density curve of patient ages (single group).
# NOTE(review): missing ages were replaced by 0 when the data was loaded, so
# they are part of this distribution — confirm that is intended.
group_labels = ['Age']
hist_data = [data['Age'].tolist()]

fig = ff.create_distplot(hist_data, group_labels)
fig.update_layout(
    title=dict(
        text="Distribution of Age",
        y=0.9,
        x=0.05,
        xanchor='left',
        yanchor='top',
    ),
    xaxis_title="Age",
    font={
        'family': "Courier New, monospace",
        'size': 15,
        'color': "#7f7f7f",
    },
)
fig.show()
In [17]:
# Cases per area, most affected first, with columns 'Loc' (area name) and
# 'Cnt' (number of records).
# The columns are named explicitly before sorting: the default column names
# produced by value_counts().to_frame().reset_index() differ between pandas
# versions, so sorting on an implicit 'Area' column can end up ordering by
# area name instead of by count.
Area = (
    data['Area']
    .value_counts()
    .rename_axis('Loc')
    .reset_index(name='Cnt')
    .sort_values('Cnt', ascending=False)
)
In [18]:
# Ten areas with the most male / most female cases, ordered from the smallest
# to the largest count.
# NOTE: this rebinds mCnt and fCnt, which earlier held the month-wise counts.
mCnt = (
    data.loc[data['Sex'].eq('M'), 'Area']
    .value_counts()
    .nlargest(10)
    .sort_values()
)
fCnt = (
    data.loc[data['Sex'].eq('F'), 'Area']
    .value_counts()
    .nlargest(10)
    .sort_values()
)

Top 10 areas with most reported cases

In [19]:
# Horizontal stacked bars of the top-10 area counts per sex.
# NOTE(review): mCnt and fCnt are each the top 10 of their own sex, so the two
# traces need not cover the same areas; only the male bars carry text labels.
male_bars = go.Bar(
    name='Male',
    orientation='h',
    x=mCnt,
    y=mCnt.index,
    text=mCnt,
    marker={
        'color': 'rgba(31, 58, 147, 1)',
        'line': {'color': 'rgba(103, 65, 114, 1)', 'width': 0.3},
    },
)
female_bars = go.Bar(
    name='Female',
    orientation='h',
    x=fCnt,
    y=fCnt.index,
    marker={
        'color': 'rgba(241, 90, 34, 1)',
        'line': {'color': 'rgba(103, 65, 114, 1)', 'width': 0.3},
    },
)

fig = go.Figure()
fig.add_trace(male_bars)
fig.add_trace(female_bars)

fig.update_layout(
    barmode='stack',
    title=dict(
        text="Top 10 Areas with Reported Cases (Gender Wise)",
        y=0.95,
        x=0.03,
        xanchor='left',
        yanchor='top',
    ),
    font={
        'family': "Courier New, monospace",
        'size': 15,
        'color': "#7f7f7f",
    },
)
fig.show()
In [20]:
# age groups
# Bucket ages into the right-closed intervals (0, 20], (20, 40], ... (80, 100]
# and count the records per bucket for all patients, females and males.
# Ages of 0 (including the fill value used for missing ages) and ages above
# 100 fall outside every interval and are not counted.
age_bins = [0, 20, 40, 60, 80, 100]

ageCnt = data['Age']
fageCnt = data.loc[data['Sex'] == 'F', 'Age']
mageCnt = data.loc[data['Sex'] == 'M', 'Age']

# value_counts() orders by frequency; sort_index() restores the bin order so
# position k always means the k-th age bracket. Downstream code pairs these
# counts with bracket labels (and male with female counts) by position.
atCnt = pd.cut(ageCnt, age_bins).value_counts().sort_index()
afCnt = pd.cut(fageCnt, age_bins).value_counts().sort_index()
amCnt = pd.cut(mageCnt, age_bins).value_counts().sort_index()

Cases in each age group (%)

In [21]:
# Two concentric donuts: the inner ring shows the share of cases per age
# bracket, the outer ring splits every bracket into male and female cases.

# The counts are put into bin order before being matched with labels by
# position; value_counts() output ordered by frequency would otherwise attach
# the wrong bracket label (and pair different brackets for the two sexes).
total_by_bracket = atCnt.sort_index()

# Interleave male and female counts bracket by bracket: M, F, M, F, ...
subgroup_size = []
for male_n, female_n in zip(amCnt.sort_index(), afCnt.sort_index()):
    subgroup_size.extend([male_n, female_n])

lbl = ['Below 20', '20 - 40', '40 - 60', '60 - 80', 'Above 80']

agegrp = go.FigureWidget()
agegrp.add_trace(go.Pie(values=total_by_bracket,
                     labels=lbl,
                     domain={'x': [0.2, 0.8], 'y': [0.1, 0.9]},
                     hole=0.5,
                     direction='clockwise',
                     name='Total',
                     sort=False,
                     marker={'colors': ['#ec3e40', '#4c54c2', '#32a033', '#900C3F', '#ffff00']}))

# Every label must be unique: Plotly sums the values of duplicated pie labels,
# which previously merged the two oldest-bracket slices into one. The first
# and last pairs now also use the correct comparison sign.
agegrp.add_trace(go.Pie(values=subgroup_size,
                     labels=['<20-M', '<20-F', '20<M<40', '20<F<40', '40<M<60',
                             '40<F<60', '60<M<80', '60<F<80', '>80-M', '>80-F'],
                     domain={'x': [0.1, 0.9], 'y': [0, 1]},
                     hole=0.75,
                     direction='clockwise',
                     name='Gender Wise',
                     sort=False,
                     marker={'colors': ['#ffffff', '#999966']*5},
                     showlegend=False))
agegrp.update_layout(
    title={
        'text': "Cases in each Age Group (%)",
        'y': 0.95,
        'x': 0.03,
        'xanchor': 'left',
        'yanchor': 'top'},
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    ))
agegrp.show()
In [22]:
# race bar plot
In [23]:
# Yearly case counts (all sexes) for every area in mCnt.index, flattened into
# `alph` in year-major order: all areas for the first year, then all areas
# for the next year, and so on.
data['Year'] = data['DOR'].map(lambda x: x.strftime('%Y'))

# Years covered by the race plot, as the string labels stored in data['Year'].
race_years = [str(year) for year in range(2013, 2019)]

# `qu` keeps its original shape: one single-element list per area holding that
# area's per-year counts. Each series is re-indexed to the full list of years
# (0 where an area has no cases), because indexing the raw value_counts() by
# position would attribute counts to the wrong year — or fail — whenever an
# area is missing a year.
qu = []
for z in mCnt.index:
    per_year = data.loc[data['Area'] == z, 'Year'].value_counts()
    qu.append([per_year.reindex(race_years, fill_value=0)])

# Label-based lookup avoids the deprecated positional fallback on a string index.
alph = [counts.loc[year] for year in race_years for (counts,) in qu]
In [24]:
def duplicate(testList, n):
    """Return a list in which each element of ``testList`` appears ``n`` times in a row.

    The order of ``testList`` is kept, e.g. ``duplicate([1, 2], 2)`` gives
    ``[1, 1, 2, 2]``.
    """
    repeated = []
    for ele in testList:
        repeated.extend([ele] * n)
    return repeated
In [25]:
# Long-format table for the bar race: one row per (year, area) with the case
# count taken from `alph`, which is laid out year by year and, within a year,
# in the order of mCnt.index.
# The area names are therefore read from mCnt.index instead of being typed
# out: a hand-maintained list has to match that index in both order and
# length, and silently mislabels the bars when it does not.
race_areas = list(mCnt.index)
race_year_values = [2013, 2014, 2015, 2016, 2017, 2018]

df = pd.DataFrame({"Name": race_areas * len(race_year_values),
                   "Year": duplicate(race_year_values, len(race_areas)),
                   "Number": alph})
In [26]:
# mapping colors
def name_to_color(names, r_min=0, r_max=255, g_min=0, g_max=255, b_min=0, b_max=255, seed=None):
    """Assign a random RGB colour string to every distinct value in ``names``.

    Parameters
    ----------
    names : object with a ``.unique()`` method (e.g. a pandas Series)
        Values to colour; each distinct value gets one colour.
    r_min, r_max, g_min, g_max, b_min, b_max : int
        Inclusive bounds for the red, green and blue channels.
    seed : optional
        When given, colours are drawn from a private ``random.Random(seed)``
        so the mapping is reproducible and the global random state is left
        untouched. When ``None`` (the default) the module-level ``random``
        generator is used.

    Returns
    -------
    dict
        Maps each distinct name to a string of the form ``'rgb(R, G, B)'``.
    """
    rng = random if seed is None else random.Random(seed)
    mapping_colors = dict()

    for name in names.unique():
        # Channels are drawn in red, green, blue order for every name.
        red = rng.randint(r_min, r_max)
        green = rng.randint(g_min, g_max)
        blue = rng.randint(b_min, b_max)
        mapping_colors[name] = 'rgb({}, {}, {})'.format(red, green, blue)

    return mapping_colors

# Fix the random seed before drawing the colours so every area keeps the same
# colour across kernel restarts and re-runs of the notebook.
random.seed(42)
mapping_colors = name_to_color(df.Name, 125, 255, 0, 185, 0, 185)

# Attach each area's colour to its rows.
df['Color'] = df['Name'].map(mapping_colors)
In [27]:
# fn to create list of frames
def frames_animation(df, title):
    """Build one animation frame per year between the first and last year in ``df``.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'Year', 'Name', 'Number' and 'Color'.
    title : str
        Title prefix; the frame's year is appended to it.

    Returns
    -------
    list
        ``go.Frame`` objects in ascending year order, each holding a single
        bar trace and its own layout.
    """
    def _frame_for(year):
        # Bars for the rows of this year only.
        rows = df[df['Year'] == year]
        bars = go.Bar(
            x=rows['Name'],
            y=rows['Number'],
            marker_color=rows['Color'],
            hoverinfo='none',
            textposition='outside',
            texttemplate='%{x}<br>%{y}',
            cliponaxis=False,
        )
        frame_layout = go.Layout(
            font={'size': 14, 'color': "white"},
            plot_bgcolor='#111111',
            xaxis={'showline': False, 'visible': False},
            yaxis={'showline': False, 'visible': False},
            bargap=0.15,
            title=title + str(year),
        )
        return go.Frame(data=[bars], layout=frame_layout)

    first_year = df['Year'].min()
    year_after_last = df['Year'].max() + 1
    return [_frame_for(year) for year in range(first_year, year_after_last)]
In [28]:
# fn to create bar race plot
def bar_race_plot(df, title, list_of_frames):
    """Create the animated bar chart with Play and Stop buttons.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'Year', 'Name', 'Number' and 'Color'.
    title : str
        Title prefix; the first year in ``df`` is appended to it.
    list_of_frames : iterable
        Animation frames, as produced by ``frames_animation``.

    Returns
    -------
    go.Figure
        Figure showing the first year's bars, with the frames attached.
    """
    initial_year = df['Year'].min()
    first_rows = df[df['Year'] == initial_year]
    # Fix the y range to the overall maximum so bar heights stay comparable
    # from frame to frame.
    range_max = df['Number'].max()

    play_button = dict(
        label="Play",
        method="animate",
        args=[None, {"frame": {"duration": 2000, "redraw": True},
                     "fromcurrent": True}])
    # The first argument must be the list [None] to halt the animation; a bare
    # None asks Plotly to animate through all frames, so "Stop" would not stop.
    stop_button = dict(
        label="Stop",
        method="animate",
        args=[[None], {"frame": {"duration": 0, "redraw": False},
                       "mode": "immediate",
                       "transition": {"duration": 0}}])

    initial_bars = go.Bar(
        x=first_rows.Name, y=first_rows.Number,
        marker_color=first_rows.Color, hoverinfo='none',
        textposition='outside', texttemplate='%{x}<br>%{y}',
        # Same setting as the animation frames, so the label above the
        # tallest bar is not clipped in the initial view.
        cliponaxis=False)

    layout = go.Layout(
        font={'size': 14}, plot_bgcolor='#111111',
        xaxis={'showline': False, 'visible': False},
        yaxis={'showline': False, 'visible': False,
               'range': (0, range_max)},
        bargap=0.15, title=title + str(initial_year),
        updatemenus=[dict(type="buttons",
                          buttons=[play_button, stop_button])])

    fig = go.Figure(data=[initial_bars], layout=layout,
                    frames=list(list_of_frames))

    return fig

Bar race plot

In [29]:
# Title prefix shared by the figure and every frame; the year is appended.
title = 'Race Bar_plot of Cases in Top 10 Areas -- '
# Build the per-year frames, assemble the animated figure and display it.
list_of_frames = frames_animation(df, title)
fig = bar_race_plot(df, title, list_of_frames)
fig.show()